# -*- coding: utf-8 -*-
'''
Created on 2016年12月23日

@author: ZhuJiahui
'''

import os
import time
import numpy as np
from gensim import corpora, models
from file_utils.file_reader import read_to_2d_list, read_to_2_list
from topic_utils.distribution_util import get_real_topics
from file_utils.file_writer import quick_write_1d_to_text


def gensim_lda(contents_in_id, plain_word_id_list, latent_topic_number=10, topic_words_to_print=10):
    '''
    Run gensim's single-core LDA on a corpus of word-id documents.

    :param contents_in_id: documents as a 2d list of word-id strings
    :param plain_word_id_list: vocabulary as a 1d list of word-id strings;
        tokens must use the same form as those in contents_in_id
    :param latent_topic_number: number of latent topics
    :param topic_words_to_print: number of top words kept per topic
    :return: (LDA model (gensim LdaModel),
              document-topic distribution THETA (2d numpy array),
              topic-word distribution PHAI (2d numpy array))
    '''
    # Wrap each vocabulary token as a one-word document so that
    # corpora.Dictionary assigns internal ids in vocabulary order.
    word_id_list = [[each] for each in plain_word_id_list]
    dictionary = corpora.Dictionary(word_id_list)
    # Convert documents to gensim's bag-of-words input format.
    tf_corpus = [dictionary.doc2bow(text) for text in contents_in_id]

    # Train the LDA model.
    lda = models.LdaModel(tf_corpus, num_topics=latent_topic_number)

    # Document-topic distribution matrix. Topics pruned by gensim's
    # minimum_probability threshold are simply left at 0 here.
    THETA = np.zeros((len(tf_corpus), latent_topic_number))
    for j, bow in enumerate(tf_corpus):
        for topic_id, prob in lda[bow]:
            THETA[j, topic_id] = prob

    # Normalize each row to sum to 1 (vectorized; same per-row division
    # as the original loop, including NaN rows when a row sums to 0).
    THETA = np.true_divide(THETA, np.sum(THETA, axis=1, keepdims=True))

    # Topic-word distribution matrix: only the top topic_words_to_print
    # words per topic are filled in (gensim returns them sorted descending).
    raw_topics = lda.show_topics(num_topics=latent_topic_number, num_words=topic_words_to_print, formatted=False)
    # raw_topics: [(0, [(word_id(str), probability(float)), ...]), (1, ...), ...]
    PHAI = np.zeros((latent_topic_number, len(word_id_list)))
    for j in range(latent_topic_number):
        for word_id, prob in raw_topics[j][1]:
            # NOTE(review): assumes the token string is a numeric word id
            # that equals its dictionary index — holds for the corpora used
            # by the drivers below; verify for new corpora.
            PHAI[j, int(word_id)] = prob

    # Normalize each topic row over the words that were kept.
    PHAI = np.true_divide(PHAI, np.sum(PHAI, axis=1, keepdims=True))

    return lda, THETA, PHAI


def gensim_multicore_lda(contents_in_id, plain_word_id_list, latent_topic_number=10, topic_words_to_print=10):
    '''
    Run gensim's multicore LDA (defaults to CPU core count - 1 workers)
    on a corpus of word-id documents.

    :param contents_in_id: documents as a 2d list of word-id strings
    :param plain_word_id_list: vocabulary as a 1d list of word-id strings;
        tokens must use the same form as those in contents_in_id
    :param latent_topic_number: number of latent topics
    :param topic_words_to_print: number of top words kept per topic
    :return: (LDA model (gensim LdaMulticore),
              document-topic distribution THETA (2d numpy array),
              topic-word distribution PHAI (2d numpy array))
    '''
    # Wrap each vocabulary token as a one-word document so that
    # corpora.Dictionary assigns internal ids in vocabulary order.
    word_id_list = [[each] for each in plain_word_id_list]
    dictionary = corpora.Dictionary(word_id_list)
    # Convert documents to gensim's bag-of-words input format.
    tf_corpus = [dictionary.doc2bow(text) for text in contents_in_id]

    # Train the multicore LDA model.
    lda = models.LdaMulticore(tf_corpus, num_topics=latent_topic_number)

    # Document-topic distribution matrix. Topics pruned by gensim's
    # minimum_probability threshold are simply left at 0 here.
    THETA = np.zeros((len(tf_corpus), latent_topic_number))
    for j, bow in enumerate(tf_corpus):
        for topic_id, prob in lda[bow]:
            THETA[j, topic_id] = prob

    # Normalize each row to sum to 1 (vectorized; same per-row division
    # as the original loop, including NaN rows when a row sums to 0).
    THETA = np.true_divide(THETA, np.sum(THETA, axis=1, keepdims=True))

    # Topic-word distribution matrix: only the top topic_words_to_print
    # words per topic are filled in (gensim returns them sorted descending).
    raw_topics = lda.show_topics(num_topics=latent_topic_number, num_words=topic_words_to_print, formatted=False)
    PHAI = np.zeros((latent_topic_number, len(word_id_list)))
    for j in range(latent_topic_number):
        for word_id, prob in raw_topics[j][1]:
            # NOTE(review): assumes the token string is a numeric word id
            # that equals its dictionary index — holds for the corpora used
            # by the drivers below; verify for new corpora.
            PHAI[j, int(word_id)] = prob

    # Normalize each topic row over the words that were kept.
    PHAI = np.true_divide(PHAI, np.sum(PHAI, axis=1, keepdims=True))

    return lda, THETA, PHAI


def lda_test1():
    '''
    Driver: run and time gensim LDA on corpus 44 and print its topics.
    '''
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'

    read_filename1 = root_directory + 'dataset/text_model/tm_corpus/44/44.docs'
    read_filename2 = root_directory + 'dataset/text_model/tm_corpus/44/44.vocab'

    contents_in_id = read_to_2d_list(read_filename1, " ")
    plain_word_id_list, word_list = read_to_2_list(read_filename2, ":")

    # Run gensim LDA and time it. time.clock() was deprecated in 3.3 and
    # removed in Python 3.8; perf_counter() is the portable wall-clock timer.
    start_time = time.perf_counter()
    _, THETA, PHAI = gensim_lda(contents_in_id, plain_word_id_list, 50, topic_words_to_print=10)
    this_time = time.perf_counter() - start_time
    print(this_time)

    # Map the topic-word distribution back to readable topic phrases.
    real_topics = get_real_topics(PHAI, word_list)
    for each in real_topics:
        print(each)


def lda_test2():
    '''
    Driver: run and time gensim LDA on the large mat.txt corpus with a
    synthetic numeric vocabulary, then print the topics.
    '''
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'

    read_filename = root_directory + 'dataset/lda_test/mat.txt'

    contents_in_id = read_to_2d_list(read_filename, ",")
    # Vocabulary is simply the word ids "0".."368098" as strings.
    plain_word_id_list = [str(i) for i in range(368099)]

    # Run gensim LDA and time it. time.clock() was deprecated in 3.3 and
    # removed in Python 3.8; perf_counter() is the portable wall-clock timer.
    start_time = time.perf_counter()
    _, THETA, PHAI = gensim_lda(contents_in_id, plain_word_id_list, 50, topic_words_to_print=10)
    this_time = time.perf_counter() - start_time
    print(this_time)

    # With a numeric vocabulary the "words" printed are the ids themselves.
    real_topics = get_real_topics(PHAI, plain_word_id_list)
    for each in real_topics:
        print(each)

def lda_test3():
    '''
    Batch driver: run gensim LDA over corpora 60-69 and persist, per corpus,
    the document-topic matrix, the topic-word matrix and the readable
    topic phrases into dataset/lda/*.
    '''
    now_directory = os.getcwd()
    root_directory = os.path.dirname(now_directory) + '/'
    read_directory = root_directory + 'dataset/text_model/tm_corpus'

    latent_topic_number = 20
    write_directory = root_directory + 'dataset/lda'
    write_directory1 = root_directory + 'dataset/lda/feed_topic' + str(latent_topic_number)
    write_directory2 = root_directory + 'dataset/lda/topic_word' + str(latent_topic_number)
    write_directory3 = root_directory + 'dataset/lda/real_topic' + str(latent_topic_number)

    # makedirs(exist_ok=True) replaces the racy exists()+mkdir chains and
    # also creates the parent directory when needed.
    for directory in (write_directory, write_directory1, write_directory2, write_directory3):
        os.makedirs(directory, exist_ok=True)

    # Number of words kept per topic.
    sp = 10

    for i in range(60, 70):
        contents_in_id = read_to_2d_list(read_directory + '/' + str(i) + '/' + str(i) + '.docs', " ")
        plain_word_id_list, word_list = read_to_2_list(read_directory + '/' + str(i) + '/' + str(i) + '.vocab', ":")

        # Run gensim LDA and time it. time.clock() was deprecated in 3.3 and
        # removed in Python 3.8; perf_counter() is the portable timer.
        start_time = time.perf_counter()
        _, THETA, PHAI = gensim_lda(contents_in_id, plain_word_id_list, latent_topic_number, topic_words_to_print=sp)
        this_time = time.perf_counter() - start_time

        # Human-readable topic phrases for this corpus.
        real_topics = get_real_topics(PHAI, word_list)

        # Serialize each distribution row as one space-separated line.
        PHAI_to_string = [" ".join(str(x) for x in row) for row in PHAI]
        THETA_to_string = [" ".join(str(x) for x in row) for row in THETA]

        quick_write_1d_to_text(write_directory1 + '/' + str(i) + '.txt', THETA_to_string)
        quick_write_1d_to_text(write_directory2 + '/' + str(i) + '.txt', PHAI_to_string)
        quick_write_1d_to_text(write_directory3 + '/' + str(i) + '.txt', real_topics)

        print(this_time)

if __name__ == '__main__':
    # Script entry point: run the batch LDA experiment over corpora 60-69.
    lda_test3()
